// The libMesh Finite Element Library.
// Copyright (C) 2002-2018 Benjamin S. Kirk, John W. Peterson, Roy H. Stogner

// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.

// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


#ifndef LIBMESH_DENSE_MATRIX_IMPL_H
#define LIBMESH_DENSE_MATRIX_IMPL_H

// C++ Includes
#include <cstdlib> // *must* precede <cmath> for proper std:abs() on PGI, Sun Studio CC
#include <cmath> // for sqrt

// Local Includes
#include "libmesh/dense_matrix.h"
#include "libmesh/dense_vector.h"
#include "libmesh/libmesh.h"

namespace libMesh
{



// ------------------------------------------------------------
// Dense Matrix member functions

template<typename T>
void DenseMatrix<T>::left_multiply (const DenseMatrixBase<T> & M2)
{
  if (this->use_blas_lapack)
    this->_multiply_blas(M2, LEFT_MULTIPLY);
  else
    {
      // (*this) <- M2 * (*this)
      // Where:
      // (*this) = (m x n),
      // M2      = (m x p),
      // M3      = (p x n)

      // M3 is a copy of *this before it gets resize()d
      DenseMatrix<T> M3(*this);

      // Resize *this so that the result can fit
      this->resize (M2.m(), M3.n());

      // Call the multiply function in the base class
      this->multiply(*this, M2, M3);
    }
}



template<typename T>
template<typename T2>
void DenseMatrix<T>::left_multiply (const DenseMatrixBase<T2> & M2)
{
  // (*this) <- M2 * (*this)
  // Where:
  // (*this) = (m x n),
  // M2      = (m x p),
  // M3      = (p x n)

  // M3 is a copy of *this before it gets resize()d
  DenseMatrix<T> M3(*this);

  // Resize *this so that the result can fit
  this->resize (M2.m(), M3.n());

  // Call the multiply function in the base class
  this->multiply(*this, M2, M3);
}



template<typename T>
void DenseMatrix<T>::left_multiply_transpose(const DenseMatrix<T> & A)
{
  if (this->use_blas_lapack)
    this->_multiply_blas(A, LEFT_MULTIPLY_TRANSPOSE);
  else
    {
      //Check to see if we are doing (A^T)*A
      if (this == &A)
        {
          //libmesh_here();
          DenseMatrix<T> B(*this);

          // Simple but inefficient way
          // return this->left_multiply_transpose(B);

          // More efficient, but more code way
          // If A is mxn, the result will be a square matrix of Size n x n.
          const unsigned int n_rows = A.m();
          const unsigned int n_cols = A.n();

          // resize() *this and also zero out all entries.
          this->resize(n_cols,n_cols);

          // Compute the lower-triangular part
          for (unsigned int i=0; i<n_cols; ++i)
            for (unsigned int j=0; j<=i; ++j)
              for (unsigned int k=0; k<n_rows; ++k) // inner products are over n_rows
                (*this)(i,j) += B(k,i)*B(k,j);

          // Copy lower-triangular part into upper-triangular part
          for (unsigned int i=0; i<n_cols; ++i)
            for (unsigned int j=i+1; j<n_cols; ++j)
              (*this)(i,j) = (*this)(j,i);
        }

      else
        {
          DenseMatrix<T> B(*this);

          this->resize (A.n(), B.n());

          libmesh_assert_equal_to (A.m(), B.m());
          libmesh_assert_equal_to (this->m(), A.n());
          libmesh_assert_equal_to (this->n(), B.n());

          const unsigned int m_s = A.n();
          const unsigned int p_s = A.m();
          const unsigned int n_s = this->n();

          // Do it this way because there is a
          // decent chance (at least for constraint matrices)
          // that A.transpose(i,k) = 0.
          for (unsigned int i=0; i<m_s; i++)
            for (unsigned int k=0; k<p_s; k++)
              if (A.transpose(i,k) != 0.)
                for (unsigned int j=0; j<n_s; j++)
                  (*this)(i,j) += A.transpose(i,k)*B(k,j);
        }
    }

}



template<typename T>
template<typename T2>
void DenseMatrix<T>::left_multiply_transpose(const DenseMatrix<T2> & A)
{
  //Check to see if we are doing (A^T)*A
  if (this == &A)
    {
      //libmesh_here();
      DenseMatrix<T> B(*this);

      // Simple but inefficient way
      // return this->left_multiply_transpose(B);

      // More efficient, but more code way
      // If A is mxn, the result will be a square matrix of Size n x n.
      const unsigned int n_rows = A.m();
      const unsigned int n_cols = A.n();

      // resize() *this and also zero out all entries.
      this->resize(n_cols,n_cols);

      // Compute the lower-triangular part
      for (unsigned int i=0; i<n_cols; ++i)
        for (unsigned int j=0; j<=i; ++j)
          for (unsigned int k=0; k<n_rows; ++k) // inner products are over n_rows
            (*this)(i,j) += B(k,i)*B(k,j);

      // Copy lower-triangular part into upper-triangular part
      for (unsigned int i=0; i<n_cols; ++i)
        for (unsigned int j=i+1; j<n_cols; ++j)
          (*this)(i,j) = (*this)(j,i);
    }

  else
    {
      DenseMatrix<T> B(*this);

      this->resize (A.n(), B.n());

      libmesh_assert_equal_to (A.m(), B.m());
      libmesh_assert_equal_to (this->m(), A.n());
      libmesh_assert_equal_to (this->n(), B.n());

      const unsigned int m_s = A.n();
      const unsigned int p_s = A.m();
      const unsigned int n_s = this->n();

      // Do it this way because there is a
      // decent chance (at least for constraint matrices)
      // that A.transpose(i,k) = 0.
      for (unsigned int i=0; i<m_s; i++)
        for (unsigned int k=0; k<p_s; k++)
          if (A.transpose(i,k) != 0.)
            for (unsigned int j=0; j<n_s; j++)
              (*this)(i,j) += A.transpose(i,k)*B(k,j);
    }
}



template<typename T>
void DenseMatrix<T>::right_multiply (const DenseMatrixBase<T> & M3)
{
  if (this->use_blas_lapack)
    this->_multiply_blas(M3, RIGHT_MULTIPLY);
  else
    {
      // (*this) <- M3 * (*this)
      // Where:
      // (*this) = (m x n),
      // M2      = (m x p),
      // M3      = (p x n)

      // M2 is a copy of *this before it gets resize()d
      DenseMatrix<T> M2(*this);

      // Resize *this so that the result can fit
      this->resize (M2.m(), M3.n());

      this->multiply(*this, M2, M3);
    }
}



template<typename T>
template<typename T2>
void DenseMatrix<T>::right_multiply (const DenseMatrixBase<T2> & M3)
{
  // (*this) <- M3 * (*this)
  // Where:
  // (*this) = (m x n),
  // M2      = (m x p),
  // M3      = (p x n)

  // M2 is a copy of *this before it gets resize()d
  DenseMatrix<T> M2(*this);

  // Resize *this so that the result can fit
  this->resize (M2.m(), M3.n());

  this->multiply(*this, M2, M3);
}




template<typename T>
void DenseMatrix<T>::right_multiply_transpose (const DenseMatrix<T> & B)
{
  if (this->use_blas_lapack)
    this->_multiply_blas(B, RIGHT_MULTIPLY_TRANSPOSE);
  else
    {
      //Check to see if we are doing B*(B^T)
      if (this == &B)
        {
          //libmesh_here();
          DenseMatrix<T> A(*this);

          // Simple but inefficient way
          // return this->right_multiply_transpose(A);

          // More efficient, more code
          // If B is mxn, the result will be a square matrix of Size m x m.
          const unsigned int n_rows = B.m();
          const unsigned int n_cols = B.n();

          // resize() *this and also zero out all entries.
          this->resize(n_rows,n_rows);

          // Compute the lower-triangular part
          for (unsigned int i=0; i<n_rows; ++i)
            for (unsigned int j=0; j<=i; ++j)
              for (unsigned int k=0; k<n_cols; ++k) // inner products are over n_cols
                (*this)(i,j) += A(i,k)*A(j,k);

          // Copy lower-triangular part into upper-triangular part
          for (unsigned int i=0; i<n_rows; ++i)
            for (unsigned int j=i+1; j<n_rows; ++j)
              (*this)(i,j) = (*this)(j,i);
        }

      else
        {
          DenseMatrix<T> A(*this);

          this->resize (A.m(), B.m());

          libmesh_assert_equal_to (A.n(), B.n());
          libmesh_assert_equal_to (this->m(), A.m());
          libmesh_assert_equal_to (this->n(), B.m());

          const unsigned int m_s = A.m();
          const unsigned int p_s = A.n();
          const unsigned int n_s = this->n();

          // Do it this way because there is a
          // decent chance (at least for constraint matrices)
          // that B.transpose(k,j) = 0.
          for (unsigned int j=0; j<n_s; j++)
            for (unsigned int k=0; k<p_s; k++)
              if (B.transpose(k,j) != 0.)
                for (unsigned int i=0; i<m_s; i++)
                  (*this)(i,j) += A(i,k)*B.transpose(k,j);
        }
    }
}



template<typename T>
template<typename T2>
void DenseMatrix<T>::right_multiply_transpose (const DenseMatrix<T2> & B)
{
  //Check to see if we are doing B*(B^T)
  if (this == &B)
    {
      //libmesh_here();
      DenseMatrix<T> A(*this);

      // Simple but inefficient way
      // return this->right_multiply_transpose(A);

      // More efficient, more code
      // If B is mxn, the result will be a square matrix of Size m x m.
      const unsigned int n_rows = B.m();
      const unsigned int n_cols = B.n();

      // resize() *this and also zero out all entries.
      this->resize(n_rows,n_rows);

      // Compute the lower-triangular part
      for (unsigned int i=0; i<n_rows; ++i)
        for (unsigned int j=0; j<=i; ++j)
          for (unsigned int k=0; k<n_cols; ++k) // inner products are over n_cols
            (*this)(i,j) += A(i,k)*A(j,k);

      // Copy lower-triangular part into upper-triangular part
      for (unsigned int i=0; i<n_rows; ++i)
        for (unsigned int j=i+1; j<n_rows; ++j)
          (*this)(i,j) = (*this)(j,i);
    }

  else
    {
      DenseMatrix<T> A(*this);

      this->resize (A.m(), B.m());

      libmesh_assert_equal_to (A.n(), B.n());
      libmesh_assert_equal_to (this->m(), A.m());
      libmesh_assert_equal_to (this->n(), B.m());

      const unsigned int m_s = A.m();
      const unsigned int p_s = A.n();
      const unsigned int n_s = this->n();

      // Do it this way because there is a
      // decent chance (at least for constraint matrices)
      // that B.transpose(k,j) = 0.
      for (unsigned int j=0; j<n_s; j++)
        for (unsigned int k=0; k<p_s; k++)
          if (B.transpose(k,j) != 0.)
            for (unsigned int i=0; i<m_s; i++)
              (*this)(i,j) += A(i,k)*B.transpose(k,j);
    }
}




template<typename T>
void DenseMatrix<T>::vector_mult (DenseVector<T> & dest,
                                  const DenseVector<T> & arg) const
{
  // Make sure the input sizes are compatible
  libmesh_assert_equal_to (this->n(), arg.size());

  // Resize and clear dest.
  // Note: DenseVector::resize() also zeros the vector.
  dest.resize(this->m());

  // Short-circuit if the matrix is empty
  if(this->m() == 0 || this->n() == 0)
    return;

  if (this->use_blas_lapack)
    this->_matvec_blas(1., 0., dest, arg);
  else
    {
      const unsigned int n_rows = this->m();
      const unsigned int n_cols = this->n();

      for (unsigned int i=0; i<n_rows; i++)
        for (unsigned int j=0; j<n_cols; j++)
          dest(i) += (*this)(i,j)*arg(j);
    }
}



template<typename T>
template<typename T2>
void DenseMatrix<T>::vector_mult (DenseVector<typename CompareTypes<T,T2>::supertype> & dest,
                                  const DenseVector<T2> & arg) const
{
  // Make sure the input sizes are compatible
  libmesh_assert_equal_to (this->n(), arg.size());

  // Resize and clear dest.
  // Note: DenseVector::resize() also zeros the vector.
  dest.resize(this->m());

  // Short-circuit if the matrix is empty
  if (this->m() == 0 || this->n() == 0)
    return;

  const unsigned int n_rows = this->m();
  const unsigned int n_cols = this->n();

  for (unsigned int i=0; i<n_rows; i++)
    for (unsigned int j=0; j<n_cols; j++)
      dest(i) += (*this)(i,j)*arg(j);
}



template<typename T>
void DenseMatrix<T>::vector_mult_transpose (DenseVector<T> & dest,
                                            const DenseVector<T> & arg) const
{
  // Make sure the input sizes are compatible
  libmesh_assert_equal_to (this->m(), arg.size());

  // Resize and clear dest.
  // Note: DenseVector::resize() also zeros the vector.
  dest.resize(this->n());

  // Short-circuit if the matrix is empty
  if (this->m() == 0)
    return;

  if (this->use_blas_lapack)
    {
      this->_matvec_blas(1., 0., dest, arg, /*trans=*/true);
    }
  else
    {
      const unsigned int n_rows = this->m();
      const unsigned int n_cols = this->n();

      // WORKS
      // for (unsigned int j=0; j<n_cols; j++)
      //   for (unsigned int i=0; i<n_rows; i++)
      //     dest(j) += (*this)(i,j)*arg(i);

      // ALSO WORKS, (i,j) just swapped
      for (unsigned int i=0; i<n_cols; i++)
        for (unsigned int j=0; j<n_rows; j++)
          dest(i) += (*this)(j,i)*arg(j);
    }
}



template<typename T>
template<typename T2>
void DenseMatrix<T>::vector_mult_transpose (DenseVector<typename CompareTypes<T,T2>::supertype> & dest,
                                            const DenseVector<T2> & arg) const
{
  // Make sure the input sizes are compatible
  libmesh_assert_equal_to (this->m(), arg.size());

  // Resize and clear dest.
  // Note: DenseVector::resize() also zeros the vector.
  dest.resize(this->n());

  // Short-circuit if the matrix is empty
  if (this->m() == 0)
    return;

  const unsigned int n_rows = this->m();
  const unsigned int n_cols = this->n();

  // WORKS
  // for (unsigned int j=0; j<n_cols; j++)
  //   for (unsigned int i=0; i<n_rows; i++)
  //     dest(j) += (*this)(i,j)*arg(i);

  // ALSO WORKS, (i,j) just swapped
  for (unsigned int i=0; i<n_cols; i++)
    for (unsigned int j=0; j<n_rows; j++)
      dest(i) += (*this)(j,i)*arg(j);
}



template<typename T>
void DenseMatrix<T>::vector_mult_add (DenseVector<T> & dest,
                                      const T factor,
                                      const DenseVector<T> & arg) const
{
  // Short-circuit if the matrix is empty
  if (this->m() == 0)
    {
      dest.resize(0);
      return;
    }

  if (this->use_blas_lapack)
    this->_matvec_blas(factor, 1., dest, arg);
  else
    {
      DenseVector<T> temp(arg.size());
      this->vector_mult(temp, arg);
      dest.add(factor, temp);
    }
}



template<typename T>
template<typename T2, typename T3>
void DenseMatrix<T>::vector_mult_add (DenseVector<typename CompareTypes<T, typename CompareTypes<T2,T3>::supertype>::supertype> & dest,
                                      const T2 factor,
                                      const DenseVector<T3> & arg) const
{
  // Short-circuit if the matrix is empty
  if (this->m() == 0)
    {
      dest.resize(0);
      return;
    }

  DenseVector<typename CompareTypes<T,T3>::supertype>
    temp(arg.size());
  this->vector_mult(temp, arg);
  dest.add(factor, temp);
}



template<typename T>
void DenseMatrix<T>::get_principal_submatrix (unsigned int sub_m,
                                              unsigned int sub_n,
                                              DenseMatrix<T> & dest) const
{
  libmesh_assert( (sub_m <= this->m()) && (sub_n <= this->n()) );

  dest.resize(sub_m, sub_n);
  for (unsigned int i=0; i<sub_m; i++)
    for (unsigned int j=0; j<sub_n; j++)
      dest(i,j) = (*this)(i,j);
}



template<typename T>
void DenseMatrix<T>::outer_product (const DenseVector<T> & a,
                                    const DenseVector<T> & b)
{
  const unsigned int m = a.size();
  const unsigned int n = b.size();

  this->resize(m, n);
  for (unsigned int i = 0; i < m; ++i)
    for (unsigned int j = 0; j < n; ++j)
      (*this)(i,j) = a(i) * libmesh_conj(b(j));
}



template<typename T>
void DenseMatrix<T>::get_principal_submatrix (unsigned int sub_m, DenseMatrix<T> & dest) const
{
  get_principal_submatrix(sub_m, sub_m, dest);
}



template<typename T>
void DenseMatrix<T>::get_transpose (DenseMatrix<T> & dest) const
{
  dest.resize(this->n(), this->m());

  for (unsigned int i=0; i<dest.m(); i++)
    for (unsigned int j=0; j<dest.n(); j++)
      dest(i,j) = (*this)(j,i);
}




template<typename T>
void DenseMatrix<T>::lu_solve (const DenseVector<T> & b,
                               DenseVector<T> & x)
{
  // Check to be sure that the matrix is square before attempting
  // an LU-solve.  In general, one can compute the LU factorization of
  // a non-square matrix, but:
  //
  // Overdetermined systems (m>n) have a solution only if enough of
  // the equations are linearly-dependent.
  //
  // Underdetermined systems (m<n) typically have infinitely many
  // solutions.
  //
  // We don't want to deal with either of these ambiguous cases here...
  libmesh_assert_equal_to (this->m(), this->n());

  switch(this->_decomposition_type)
    {
    case NONE:
      {
        if (this->use_blas_lapack)
          this->_lu_decompose_lapack();
        else
          this->_lu_decompose ();
        break;
      }

    case LU_BLAS_LAPACK:
      {
        // Already factored, just need to call back_substitute.
        if (this->use_blas_lapack)
          break;
      }
      libmesh_fallthrough();

    case LU:
      {
        // Already factored, just need to call back_substitute.
        if (!(this->use_blas_lapack))
          break;
      }
      libmesh_fallthrough();

    default:
      libmesh_error_msg("Error! This matrix already has a different decomposition...");
    }

  if (this->use_blas_lapack)
    this->_lu_back_substitute_lapack (b, x);
  else
    this->_lu_back_substitute (b, x);
}






template<typename T>
void DenseMatrix<T>::_lu_back_substitute (const DenseVector<T> & b,
                                          DenseVector<T> & x ) const
{
  const unsigned int
    n_cols = this->n();

  libmesh_assert_equal_to (this->m(), n_cols);
  libmesh_assert_equal_to (this->m(), b.size());

  x.resize (n_cols);

  // A convenient reference to *this
  const DenseMatrix<T> & A = *this;

  // Temporary vector storage.  We use this instead of
  // modifying the RHS.
  DenseVector<T> z = b;

  // Lower-triangular "top to bottom" solve step, taking into account pivots
  for (unsigned int i=0; i<n_cols; ++i)
    {
      // Swap
      if (_pivots[i] != static_cast<pivot_index_t>(i))
        std::swap( z(i), z(_pivots[i]) );

      x(i) = z(i);

      for (unsigned int j=0; j<i; ++j)
        x(i) -= A(i,j)*x(j);

      x(i) /= A(i,i);
    }

  // Upper-triangular "bottom to top" solve step
  const unsigned int last_row = n_cols-1;

  for (int i=last_row; i>=0; --i)
    {
      for (int j=i+1; j<static_cast<int>(n_cols); ++j)
        x(i) -= A(i,j)*x(j);
    }
}








template<typename T>
void DenseMatrix<T>::_lu_decompose ()
{
  // If this function was called, there better not be any
  // previous decomposition of the matrix.
  libmesh_assert_equal_to (this->_decomposition_type, NONE);

  // Get the matrix size and make sure it is square
  const unsigned int
    n_rows = this->m();

  // A convenient reference to *this
  DenseMatrix<T> & A = *this;

  _pivots.resize(n_rows);

  for (unsigned int i=0; i<n_rows; ++i)
    {
      // Find the pivot row by searching down the i'th column
      _pivots[i] = i;

      // std::abs(complex) must return a Real!
      Real the_max = std::abs( A(i,i) );
      for (unsigned int j=i+1; j<n_rows; ++j)
        {
          Real candidate_max = std::abs( A(j,i) );
          if (the_max < candidate_max)
            {
              the_max = candidate_max;
              _pivots[i] = j;
            }
        }

      // libMesh::out << "the_max=" << the_max << " found at row " << _pivots[i] << std::endl;

      // If the max was found in a different row, interchange rows.
      // Here we interchange the *entire* row, in Gaussian elimination
      // you would only interchange the subrows A(i,j) and A(p(i),j), for j>i
      if (_pivots[i] != static_cast<pivot_index_t>(i))
        {
          for (unsigned int j=0; j<n_rows; ++j)
            std::swap( A(i,j), A(_pivots[i], j) );
        }


      // If the max abs entry found is zero, the matrix is singular
      if (A(i,i) == libMesh::zero)
        libmesh_error_msg("Matrix A is singular!");

      // Scale upper triangle entries of row i by the diagonal entry
      // Note: don't scale the diagonal entry itself!
      const T diag_inv = 1. / A(i,i);
      for (unsigned int j=i+1; j<n_rows; ++j)
        A(i,j) *= diag_inv;

      // Update the remaining sub-matrix A[i+1:m][i+1:m]
      // by subtracting off (the diagonal-scaled)
      // upper-triangular part of row i, scaled by the
      // i'th column entry of each row.  In terms of
      // row operations, this is:
      // for each r > i
      //   SubRow(r) = SubRow(r) - A(r,i)*SubRow(i)
      //
      // If we were scaling the i'th column as well, like
      // in Gaussian elimination, this would 'zero' the
      // entry in the i'th column.
      for (unsigned int row=i+1; row<n_rows; ++row)
        for (unsigned int col=i+1; col<n_rows; ++col)
          A(row,col) -= A(row,i) * A(i,col);

    } // end i loop

  // Set the flag for LU decomposition
  this->_decomposition_type = LU;
}



template<typename T>
void DenseMatrix<T>::svd (DenseVector<Real> & sigma)
{
  // We use the LAPACK svd implementation
  _svd_lapack(sigma);
}


template<typename T>
void DenseMatrix<T>::svd (DenseVector<Real> & sigma,
                          DenseMatrix<Number> & U,
                          DenseMatrix<Number> & VT)
{
  // We use the LAPACK svd implementation
  _svd_lapack(sigma, U, VT);
}



template<typename T>
void DenseMatrix<T>::svd_solve(const DenseVector<T> & rhs,
                               DenseVector<T> & x,
                               Real rcond) const
{
  _svd_solve_lapack(rhs, x, rcond);
}



template<typename T>
void DenseMatrix<T>::evd (DenseVector<T> & lambda_real,
                          DenseVector<T> & lambda_imag)
{
  // We use the LAPACK eigenvalue problem implementation
  _evd_lapack(lambda_real, lambda_imag);
}



template<typename T>
void DenseMatrix<T>::evd_left(DenseVector<T> & lambda_real,
                              DenseVector<T> & lambda_imag,
                              DenseMatrix<T> & VL)
{
  // We use the LAPACK eigenvalue problem implementation
  _evd_lapack(lambda_real, lambda_imag, &VL, nullptr);
}



template<typename T>
void DenseMatrix<T>::evd_right(DenseVector<T> & lambda_real,
                               DenseVector<T> & lambda_imag,
                               DenseMatrix<T> & VR)
{
  // We use the LAPACK eigenvalue problem implementation
  _evd_lapack(lambda_real, lambda_imag, nullptr, &VR);
}



template<typename T>
void DenseMatrix<T>::evd_left_and_right(DenseVector<T> & lambda_real,
                                        DenseVector<T> & lambda_imag,
                                        DenseMatrix<T> & VL,
                                        DenseMatrix<T> & VR)
{
  // We use the LAPACK eigenvalue problem implementation
  _evd_lapack(lambda_real, lambda_imag, &VL, &VR);
}



template<typename T>
T DenseMatrix<T>::det ()
{
  switch(this->_decomposition_type)
    {
    case NONE:
      {
        // First LU decompose the matrix.
        // Note that the lu_decompose routine will check to see if the
        // matrix is square so we don't worry about it.
        if (this->use_blas_lapack)
          this->_lu_decompose_lapack();
        else
          this->_lu_decompose ();
      }
    case LU:
    case LU_BLAS_LAPACK:
      {
        // Already decomposed, don't do anything
        break;
      }
    default:
      libmesh_error_msg("Error! Can't compute the determinant under the current decomposition.");
    }

  // A variable to keep track of the running product of diagonal terms.
  T determinant = 1.;

  // Loop over diagonal terms, computing the product.  In practice,
  // be careful because this value could easily become too large to
  // fit in a double or float.  To be safe, one should keep track of
  // the power (of 10) of the determinant in a separate variable
  // and maintain an order 1 value for the determinant itself.
  unsigned int n_interchanges = 0;
  for (unsigned int i=0; i<this->m(); i++)
    {
      if (this->_decomposition_type==LU)
        if (_pivots[i] != static_cast<pivot_index_t>(i))
          n_interchanges++;

      // Lapack pivots are 1-based!
      if (this->_decomposition_type==LU_BLAS_LAPACK)
        if (_pivots[i] != static_cast<pivot_index_t>(i+1))
          n_interchanges++;

      determinant *= (*this)(i,i);
    }

  // Compute sign of determinant, depends on number of row interchanges!
  // The sign should be (-1)^{n}, where n is the number of interchanges.
  Real sign = n_interchanges % 2 == 0 ? 1. : -1.;

  return sign*determinant;
}



// The cholesky solve function first decomposes the matrix
// with cholesky_decompose and then uses the cholesky_back_substitute
// routine to find the solution x.
template <typename T>
template <typename T2>
void DenseMatrix<T>::cholesky_solve (const DenseVector<T2> & b,
                                     DenseVector<T2> & x)
{
  // Check for a previous decomposition
  switch(this->_decomposition_type)
    {
    case NONE:
      {
        this->_cholesky_decompose ();
        break;
      }

    case CHOLESKY:
      {
        // Already factored, just need to call back_substitute.
        break;
      }

    default:
      libmesh_error_msg("Error! This matrix already has a different decomposition...");
    }

  // Perform back substitution
  this->_cholesky_back_substitute (b, x);
}




// This algorithm is based on the Cholesky decomposition in
// the Numerical Recipes in C book.
template<typename T>
void DenseMatrix<T>::_cholesky_decompose ()
{
  // If we called this function, there better not be any
  // previous decomposition of the matrix.
  libmesh_assert_equal_to (this->_decomposition_type, NONE);

  // Shorthand notation for number of rows and columns.
  const unsigned int
    n_rows = this->m(),
    n_cols = this->n();

  // Just to be really sure...
  libmesh_assert_equal_to (n_rows, n_cols);

  // A convenient reference to *this
  DenseMatrix<T> & A = *this;

  for (unsigned int i=0; i<n_rows; ++i)
    {
      for (unsigned int j=i; j<n_cols; ++j)
        {
          for (unsigned int k=0; k<i; ++k)
            A(i,j) -= A(i,k) * A(j,k);

          if (i == j)
            {
#ifndef LIBMESH_USE_COMPLEX_NUMBERS
              if (A(i,j) <= 0.0)
                libmesh_error_msg("Error! Can only use Cholesky decomposition with symmetric positive definite matrices.");
#endif

              A(i,i) = std::sqrt(A(i,j));
            }
          else
            A(j,i) = A(i,j) / A(i,i);
        }
    }

  // Set the flag for CHOLESKY decomposition
  this->_decomposition_type = CHOLESKY;
}



template <typename T>
template <typename T2>
void DenseMatrix<T>::_cholesky_back_substitute (const DenseVector<T2> & b,
                                                DenseVector<T2> & x) const
{
  // Shorthand notation for number of rows and columns.
  const unsigned int
    n_rows = this->m(),
    n_cols = this->n();

  // Just to be really sure...
  libmesh_assert_equal_to (n_rows, n_cols);

  // A convenient reference to *this
  const DenseMatrix<T> & A = *this;

  // Now compute the solution to Ax =b using the factorization.
  x.resize(n_rows);

  // Solve for Ly=b
  for (unsigned int i=0; i<n_cols; ++i)
    {
      T2 temp = b(i);

      for (unsigned int k=0; k<i; ++k)
        temp -= A(i,k)*x(k);

      x(i) = temp / A(i,i);
    }

  // Solve for L^T x = y
  for (unsigned int i=0; i<n_cols; ++i)
    {
      const unsigned int ib = (n_cols-1)-i;

      for (unsigned int k=(ib+1); k<n_cols; ++k)
        x(ib) -= A(k,ib) * x(k);

      x(ib) /= A(ib,ib);
    }
}








// This routine is commented out since it is not really a memory
// efficient implementation.  Also, you don't *need* the inverse
// for anything, instead just use lu_solve to solve Ax=b.
// template<typename T>
// void DenseMatrix<T>::inverse ()
// {
//   // First LU decompose the matrix
//   // Note that the lu_decompose routine will check to see if the
//   // matrix is square so we don't worry about it.
//   if (!this->_lu_decomposed)
//     this->_lu_decompose();

//   // A unit vector which will be used as a rhs
//   // to pick off a single value each time.
//   DenseVector<T> e;
//   e.resize(this->m());

//   // An empty vector which will be used to hold the solution
//   // to the back substitutions.
//   DenseVector<T> x;
//   x.resize(this->m());

//   // An empty dense matrix to store the resulting inverse
//   // temporarily until we can overwrite A.
//   DenseMatrix<T> inv;
//   inv.resize(this->m(), this->n());

//   // Resize the passed in matrix to hold the inverse
//   inv.resize(this->m(), this->n());

//   for (unsigned int j=0; j<this->n(); ++j)
//     {
//       e.zero();
//       e(j) = 1.;
//       this->_lu_back_substitute(e, x, false);
//       for (unsigned int i=0; i<this->n(); ++i)
// inv(i,j) = x(i);
//     }

//   // Now overwrite all the entries
//   *this = inv;
// }


} // namespace libMesh

#endif // LIBMESH_DENSE_MATRIX_IMPL_H
